Below is a list of the top comutating genes with KRAS in CRC. The last four columns have the number of tunor samples with the various combination of mutations; for example, G mut & K WT has the number of tumors with the other gene (G) mutated and KRAS (K) WT.
nonallele_specific_increased_comutation_df %.% {
filter(cancer == "COAD" & hugo_symbol != "KRAS")
filter(p_value < 0.01)
arrange(p_value)
mutate(
p_value = scales::scientific(p_value, digits = 3),
odds_ratio = round(odds_ratio, digits = 3),
geneWT_krasWT = map_dbl(comut_ct_tbl, ~ .x[1, 1]),
geneMut_krasWT = map_dbl(comut_ct_tbl, ~ .x[2, 1]),
geneWT_krasMut = map_dbl(comut_ct_tbl, ~ .x[1, 2]),
geneMut_krasMut = map_dbl(comut_ct_tbl, ~ .x[2, 2]),
)
select(hugo_symbol, p_value, odds_ratio, tidyselect::starts_with("gene"))
rename(
gene = hugo_symbol,
`p-value` = p_value,
`odds ratio` = odds_ratio,
`G WT & K WT` = geneWT_krasWT,
`G mut & K WT` = geneMut_krasWT,
`G WT & K mut` = geneWT_krasMut,
`G mut & K mut` = geneMut_krasMut
)
}
The data used for this analysis was the RNA expression data from TCGA-COAD. This data set has RNA expression for 525 tumor samples. 78 of these samples are hypermutants; these samples were removed from the analysis.
main_alleles <- c("WT", "KRAS_G12A", "KRAS_G12C", "KRAS_G12D", "KRAS_G12V", "KRAS_G13D")
dusp_rna_data <- tcga_coad_rna %.% {
filter(str_detect(hugo_symbol, "DUSP"))
filter(!is_hypermutant)
select(-is_hypermutant)
mutate(
allele = ifelse(ras_allele %in% !!main_alleles, ras_allele, "Other"),
allele = str_remove(allele, "KRAS_"),
allele = factor_alleles(allele)
)
}
# Put DUSP genes in order.
dusp_order <- unique(dusp_rna_data$hugo_symbol)
dusp_num <- as.numeric(str_remove_all(dusp_order, "[:alpha:]"))
dusp_order <- dusp_order[order(dusp_num)]
dusp_rna_data$hugo_symbol <- factor(dusp_rna_data$hugo_symbol, levels = dusp_order)
Below is a sample of the RNA expression data table.
dusp_rna_data %>%
rename(
`DUSP` = hugo_symbol,
`tumor sample` = tumor_sample_barcode,
`RNA (RSEM)` = rna_expr
) %>%
select(-ras_allele) %>%
head() %>%
kbl() %>%
kable_styling(bootstrap_options = c("striped", "hover"))
| DUSP | tumor sample | RNA (RSEM) | allele |
|---|---|---|---|
| DUSP10 | TCGA-3L-AA1B | 254.352 | WT |
| DUSP10 | TCGA-4N-A93T | 133.527 | G12D |
| DUSP10 | TCGA-4T-AA8H | 275.635 | G12V |
| DUSP10 | TCGA-5M-AAT4 | 507.745 | G12D |
| DUSP10 | TCGA-5M-AAT5 | 572.762 | G12D |
| DUSP10 | TCGA-5M-AATA | 209.145 | WT |
The number of tumor samples with missing data for each DUSP gene.
dusp_rna_data %>%
filter(is.na(rna_expr)) %>%
count(hugo_symbol, sort = TRUE) %>%
pivot_wider(names_from = hugo_symbol, values_from = n) %>%
kbl() %>%
kable_styling(
bootstrap_options = c("striped", "hover"),
full_width = FALSE,
position = "left"
)
| DUSP13 | DUSP21 |
|---|---|
| 149 | 149 |
The number of tumor samples with 0 RNA expression values for each DUSP gene.
dusp_rna_data %>%
filter(rna_expr <= 0) %>%
count(hugo_symbol, sort = TRUE) %>%
pivot_wider(names_from = hugo_symbol, values_from = n) %>%
kbl() %>%
kable_styling(
bootstrap_options = c("striped", "hover"),
full_width = FALSE,
position = "left"
)
| DUSP21 | DUSP13 | DUSP5P | DUSP26 | DUSP9 | DUSP27 | DUSP15 |
|---|---|---|---|---|---|---|
| 284 | 217 | 73 | 58 | 34 | 11 | 2 |
All negative RNA expression values were set to 0.
dusp_rna_data %<>% mutate(rna_expr = map_dbl(rna_expr, ~ max(0, .x)))
dusp_rna_data %>%
filter(!is.na(rna_expr)) %>%
mutate(rna_expr = rna_expr + 1) %>%
ggplot(aes(x = allele, y = rna_expr)) +
facet_wrap(~hugo_symbol, scales = "free_y") +
geom_boxplot(outlier.shape = NA) +
scale_y_continuous(trans = "log10") +
theme(
panel.grid.major.x = element_blank(),
axis.text = element_text(size = 7),
axis.text.x = element_text(angle = 45, hjust = 1)
) +
labs(x = NULL, y = "RNA expression (log10 + 1)")
DUSP13 and DUSP21 were removed from the analysis because they were missing data and had very low expression levels.
IGNORE_DUSPS <- c("DUSP13", "DUSP21")
dusp_rna_data %<>% filter(!hugo_symbol %in% IGNORE_DUSPS)
plot_dusp_distribtions <- function(df, x) {
df %>%
ggplot(aes(x = {{ x }})) +
facet_wrap(~hugo_symbol, scales = "free") +
scale_y_continuous(expand = expansion(c(0, 0.02))) +
geom_density() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
axis.text = element_text(size = 6),
axis.text.x = element_text(angle = 30, hjust = 1),
strip.text = element_text(size = 7),
panel.spacing = unit(1, "mm")
) +
labs(x = "RNA expression")
}
plot_dusp_distribtions(dusp_rna_data, rna_expr) +
ggtitle("Non-transformed RNA expression values")
dusp_rna_data %>%
mutate(rna_expr = log10(rna_expr + 1)) %>%
plot_dusp_distribtions(rna_expr) +
ggtitle("log10(RNA + 1) transformed data")
dusp_rna_data %>%
mutate(rna_expr = sqrt(rna_expr)) %>%
plot_dusp_distribtions(rna_expr) +
ggtitle("sqrt(RNA) transformed data")
dusp_rna_data %>%
group_by(hugo_symbol) %>%
mutate(rna_expr = scale_numeric(rna_expr)) %>%
ungroup() %>%
plot_dusp_distribtions(rna_expr) +
ggtitle("Centralized and scaled (z-scale) data")
dusp_rna_data %>%
mutate(rna_expr = sqrt(rna_expr)) %>%
group_by(hugo_symbol) %>%
mutate(rna_expr = scale_numeric(rna_expr)) %>%
ungroup() %>%
plot_dusp_distribtions(rna_expr) +
ggtitle("sqrt(RNA) & z-scaled data")
The \(\log_{10}(\text{RNA} + 1)\), centralized, and scaled values will be used for the analysis.
dusp_rna_data %<>%
mutate(log10_rna_expr = log10(rna_expr + 1)) %>%
group_by(hugo_symbol) %>%
mutate(log10_z_rna = scale_numeric(log10_rna_expr)) %>%
ungroup()
dusp_corr <- dusp_rna_data %>%
pivot_wider(id = tumor_sample_barcode, names_from = hugo_symbol, values_from = log10_z_rna) %>%
select(-tumor_sample_barcode) %>%
corrr::correlate()
dusp_corr_pheat <- dusp_corr %>%
as.data.frame() %>%
column_to_rownames("rowname")
colnames(dusp_corr_pheat) <- str_remove(colnames(dusp_corr_pheat), "DUSP")
rownames(dusp_corr_pheat) <- str_remove(rownames(dusp_corr_pheat), "DUSP")
pheatmap::pheatmap(
dusp_corr_pheat,
border_color = NA,
na_col = "white",
main = "Correlation of DUSP gene expression",
)
corrr::network_plot(dusp_corr, min_cor = 0.3) +
theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
labs(title = "DUSP gene expression correlation network")
new_allele_order <- as.character(sort(unique(dusp_rna_data$allele)))
new_allele_order <- c("WT", new_allele_order[new_allele_order != "WT"])
data <- dusp_rna_data %>%
mutate(
grp_allele = case_when(
allele == "G13D" ~ "G13D",
allele == "WT" ~ "WT",
TRUE ~ "KRAS"
),
grp_allele = factor(grp_allele, levels = c("WT", "G13D", "KRAS")),
allele = factor(as.character(allele), levels = new_allele_order)
)
# FOR TESTING
if (FALSE) {
set.seed(0)
TESTING_DUSPS <- paste0("DUSP", 1:6)
TESTING_TSBS <- sample(unique(data$tumor_sample_barcode), 100)
data <- data %.% {
filter(hugo_symbol %in% TESTING_DUSPS)
filter(tumor_sample_barcode %in% TESTING_TSBS)
}
}
stash("lm_stan_hier", depends_on = "data", {
lm_stan_hier <- stan_glmer(
log10_z_rna ~ 1 + (1 + allele | hugo_symbol),
data = data,
prior = normal(location = 0, scale = 5),
prior_intercept = normal(location = 0, scale = 2),
prior_aux = exponential(rate = 1),
prior_covariance = decov(regularization = 1, concentration = 1, shape = 1, scale = 1)
)
})
#> Loading stashed object.
Trace plots for the global intercept and standard deviation \(\sigma\).
mcmc_trace(lm_stan_hier, pars = "(Intercept)") /
mcmc_trace(lm_stan_hier, pars = "sigma")
Trace plots for parameters for DUSP1.
mcmc_trace(lm_stan_hier, regex_pars = "DUSP1\\]") +
scale_x_continuous(expand = c(0, 0))
#> Scale for 'x' is already present. Adding another scale for 'x', which will
#> replace the existing scale.
pp_check(lm_stan_hier, plotfun = "stat", stat = "mean") +
ggtitle("Distrbition of error of the posterior predictions")
pp_check(lm_stan_hier, plotfun = "stat_2d", stat = c("mean", "sd")) +
ggtitle("Standard deviation and mean of the error of the posterior predictions")
lm_dusp_post <- as.data.frame(lm_stan_hier) %.% {
mutate(draw = row_number())
select(draw, `(Intercept)`, tidyselect::contains("DUSP"))
pivot_longer(
-c(draw, `(Intercept)`),
names_to = "parameter",
values_to = "value"
)
mutate(
parameter = str_remove_all(parameter, "[:punct:]"),
parameter = str_remove_all(parameter, "b|allele|hugosymbol|")
)
separate(parameter, c("allele", "dusp"), sep = " ")
mutate(allele = str_replace(allele, "Intercept", "WT"))
}
lm_dusp_post %>%
ggplot(aes(x = value)) +
facet_wrap(~dusp, ncol = 4, scales = "free") +
geom_vline(
xintercept = 0,
lty = 2,
color = "grey50",
size = 0.9
) +
geom_density(aes(color = allele), size = 1) +
scale_color_manual(values = short_allele_pal) +
scale_x_continuous(expand = c(0, 0)) +
scale_y_continuous(expand = expansion(c(0, 0.02))) +
labs(
x = "posterior distribution",
y = "probability density",
color = "KRAS allele"
)
lm_dusp_post %.%
{
group_by(dusp, allele)
summarise(
avg = median(value),
hdi_50 = list(unlist(ggdist::hdi(value, .width = 0.50))),
hdi_89 = list(unlist(ggdist::hdi(value, .width = 0.89))),
)
ungroup()
mutate(
hdi_50_lower = map_dbl(hdi_50, ~ .x[[1]]),
hdi_50_upper = map_dbl(hdi_50, ~ .x[[2]]),
hdi_89_lower = map_dbl(hdi_89, ~ .x[[1]]),
hdi_89_upper = map_dbl(hdi_89, ~ .x[[2]])
)
select(-hdi_50, -hdi_89)
} %>%
ggplot(aes(x = avg, y = allele, color = allele)) +
facet_wrap(~dusp, scales = "free_x") +
geom_rect(
xmin = -0.1, xmax = 0.1, ymin = Inf, ymax = -Inf,
fill = "grey80",
color = NA,
alpha = 0.1
) +
geom_vline(xintercept = 0, color = "grey50") +
geom_point(size = 2) +
geom_linerange(
aes(xmin = hdi_50_lower, xmax = hdi_50_upper),
size = 1.2,
alpha = 0.8
) +
geom_linerange(
aes(xmin = hdi_89_lower, xmax = hdi_89_upper),
size = 0.9,
alpha = 0.5
) +
scale_color_manual(values = short_allele_pal, drop = TRUE) +
theme(legend.position = "none") +
labs(
x = "posterior distributions",
y = NULL
)
bayestestR::describe_posterior(lm_stan_hier, effects = "all") %>%
arrange(Effects) %>%
as_tibble() %>%
mutate(
Parameter = str_remove_all(Parameter, "[:punct:]"),
Parameter = str_remove_all(Parameter, "^b|allele|hugosymbol"),
) %>%
mutate_if(is.numeric, round, digits = 2)
#> Possible multicollinearity between Sigma[hugo_symbol:alleleG12D,(Intercept)] and Sigma[hugo_symbol:(Intercept),(Intercept)] (r = 0.79), Sigma[hugo_symbol:alleleG12V,(Intercept)] and Sigma[hugo_symbol:(Intercept),(Intercept)] (r = 0.71), Sigma[hugo_symbol:alleleG12D,alleleG12D] and Sigma[hugo_symbol:alleleG12D,(Intercept)] (r = 0.77), Sigma[hugo_symbol:alleleG12V,alleleG12V] and Sigma[hugo_symbol:alleleG12V,(Intercept)] (r = 0.72), Sigma[hugo_symbol:alleleG13D,alleleG12D] and Sigma[hugo_symbol:alleleG13D,(Intercept)] (r = 0.72), Sigma[hugo_symbol:alleleG13D,alleleG13D] and Sigma[hugo_symbol:alleleG13D,(Intercept)] (r = 0.72), Sigma[hugo_symbol:alleleOther,alleleG12D] and Sigma[hugo_symbol:alleleOther,(Intercept)] (r = 0.74). This might lead to inappropriate results. See 'Details' in '?rope'.